Setup

rm(list=ls())
knitr::opts_knit$set(root.dir = '~/OneDrive - University College London/Projects/Ostridge_PanAf/baypass/analysing_baypass_output/baypass_aux/')

Parameters

subsps='ce'
env_data='f_over_sum_known_trees'
habitat_col=c("f_over_sum_known_trees"='black', "f_over_sum_known_trees.rm_e.IssaValley"="purple")
#fprs=c(0.005, 0.001, 0.0005, 0.0001, 0.00005)
fprs=c(0.005, 0.001, 0.0005)
chr21=TRUE
flanks=1000

Issa Valley is a large outlier for forest-tree-percentage in the Central-Eastern dataset. Here I investigate how much it effects the results by removing it, running BayPass and comparing the results with the original analysis.

Library

library(data.table)
options(datatable.fread.datatable=FALSE)
library(dplyr)
library(ggplot2)
library(ggVennDiagram)
library(psych)
library(tidyverse)
library(cowplot)
library(reshape)
library(ggpattern)
# Call scripts containing custom functions 
source("../scripts/baypass_tools.R")
source("../baypass_core/scripts/candidate_allele_frequency_patterns.R")
source("scripts/baypass_aux_tools.R")   

Read in data

## - ce 
## Read chr21

rm Issa Valley

# Coverage
cov=list()
cov[['exome']]=fread(paste0("../../../allele_frequencies/exome/output/f5.ce.pops_minInd.6or50pct_missing.pops.0.3/f5.ce.pops.all.chrs_missing.pops.0.3_minMAC2_coverage"))
cov[['chr21']]=fread(paste0("../../../allele_frequencies/chr21/output/chr21.f7.ce.pops_minInd.6or50pct_missing.pops.0.3/chr21.f7.ce.pops_chr21_missing.pops.0.3_pop_minMAC2_coverage"))
for(data in names(cov)){
  cov[[data]]=cov[[data]][colnames(cov[[data]])!="e_IssaValley_coverage",]
  rownames(cov[[data]])=paste(gsub("^chr", "", cov[[data]]$chr), cov[[data]]$pos, sep="_")
  cov[[data]]=as.data.frame(rowSums(cov[[data]][grepl("_coverage$", colnames(cov[[data]]))], na.rm = T))
  colnames(cov[[data]])="coverage"
}

# Exome
betai.rm_IV=NULL
for(seed in c('', '_seed.100', '_seed.10k')){
  betai.tmp=fread(paste0("../../running_baypass/exome/output/f5.ce.pops_minInd.6or50pct_missing.pops.0.3_minMAC2.rm_e.IssaValley/auxi/f_over_sum_known_trees.rm_e.IssaValley/f5.ce.pops_missing.pops.0.3_minMAC2_", env_data, ".rm_e.IssaValley",seed,"_summary_betai.out"), select=c('MRK', 'M_Beta', 'BF(dB)'))
  if(is.null(betai.rm_IV)){
    betai.rm_IV=betai.tmp
  }else{
    betai.tmp=betai.tmp[c('M_Beta', 'BF(dB)')]
    if(seed=='_seed.100'){colnames(betai.tmp)=paste0(colnames(betai.tmp), ".r1")}
    if(seed=='_seed.10k'){colnames(betai.tmp)=paste0(colnames(betai.tmp), ".r2")}
    betai.rm_IV=cbind(betai.rm_IV, betai.tmp)
  }
}
## Get medians
betai.rm_IV$`M_Beta.median`=apply(betai.rm_IV[c('M_Beta', 'M_Beta.r1', 'M_Beta.r2')], 1, median, na.rm=T)
betai.rm_IV$`BF(dB).median`=apply(betai.rm_IV[c('BF(dB)', 'BF(dB).r1', 'BF(dB).r2')], 1, median, na.rm=T)

betai.rm_IV=merge(betai[[subsp]][c('MRK', 'chr', 'pos')], betai.rm_IV)
betai.rm_IV$chr_pos=paste(betai.rm_IV$chr, betai.rm_IV$pos, sep="_")
betai.rm_IV=merge(betai.rm_IV, cov[['exome']], by.x='chr_pos', by.y=0)
betai.rm_IV$COVARIABLE_name=paste0(env_data,'.rm_e.IssaValley')
betai[[subsp]]=rbind(betai[[subsp]][colnames(betai.rm_IV)], betai.rm_IV)

colnames(betai.rm_IV)[!colnames(betai.rm_IV) %in% colnames(betai[[subsp]])]
## character(0)
# Chr21
betai_chr21.rm_IV=NULL
for(seed in c('', '_seed.100', '_seed.10k')){
  betai_chr21.tmp=fread(paste0("../../running_baypass/chr21/output/chr21.f7.ce.pops_minInd.6or50pct_missing.pops.0.3_minMAC2.non-genic_1000bp.flanks.rm_e.IssaValley/auxi/", env_data, ".rm_e.IssaValley/chr21.f7.ce.pops_missing.pops.0.3_minMAC2.non-genic_1000bp.flanks.", env_data, ".rm_e.IssaValley",seed,"_summary_betai.out"), select=c('MRK', 'M_Beta', 'BF(dB)'))
  if(is.null(betai_chr21.rm_IV)){
    betai_chr21.rm_IV=betai_chr21.tmp
  }else{
    betai_chr21.tmp=betai_chr21.tmp[c('M_Beta', 'BF(dB)')]
    if(seed=='_seed.100'){colnames(betai_chr21.tmp)=paste0(colnames(betai_chr21.tmp), ".r1")}
    if(seed=='_seed.10k'){colnames(betai_chr21.tmp)=paste0(colnames(betai_chr21.tmp), ".r2")}
    betai_chr21.rm_IV=cbind(betai_chr21.rm_IV, betai_chr21.tmp)
  }
}
## Get medians
betai_chr21.rm_IV$`M_Beta.median`=apply(betai_chr21.rm_IV[c('M_Beta', 'M_Beta.r1', 'M_Beta.r2')], 1, median, na.rm=T)
betai_chr21.rm_IV$`BF(dB).median`=apply(betai_chr21.rm_IV[c('BF(dB)', 'BF(dB).r1', 'BF(dB).r2')], 1, median, na.rm=T)

betai_chr21.rm_IV=merge(betai_chr21[[subsp]][c('MRK', 'chr', 'pos')], betai_chr21.rm_IV)
betai_chr21.rm_IV$chr_pos=paste(betai_chr21.rm_IV$chr, betai_chr21.rm_IV$pos, sep="_")
betai_chr21.rm_IV=merge(betai_chr21.rm_IV, cov[['chr21']], by.x='chr_pos', by.y=0)
betai_chr21.rm_IV$COVARIABLE_name=paste0(env_data,'.rm_e.IssaValley')
betai_chr21[[subsp]]=rbind(betai_chr21[[subsp]][colnames(betai_chr21.rm_IV)], betai_chr21.rm_IV)

cov.in.tmp=cov.in[[subsp]][['Real']]
rownames(cov.in.tmp)=paste0(env_data,'.rm_e.IssaValley')
cov.in[[subsp]][['Real']]=rbind(cov.in[[subsp]][['Real']], cov.in.tmp)

Volcano Plot

## ce
## Warning in max(betai[[y]]): no non-missing arguments to max; returning -Inf
## Warning in min(betai[[y]]): no non-missing arguments to min; returning Inf
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2 rows containing missing values (`geom_step()`).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 2 rows containing missing values (`geom_step()`).
## Warning: Transformation introduced infinite values in continuous y-axis

## ce
## Warning in max(betai[[y]]): no non-missing arguments to max; returning -Inf
## Warning in min(betai[[y]]): no non-missing arguments to min; returning Inf
## Warning: Removed 2 rows containing missing values (`geom_step()`).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 2 rows containing missing values (`geom_step()`).
## Warning: Transformation introduced infinite values in continuous y-axis

I think this is exactly what we expect to see if there is no (or very little selection) in non-genic chr21 i.e. basically no difference in the distributions for non-genic-chr21 (as there should be not selection in either) - the small shift to lower BFs in rm Issa Valley due to the small number of non-coding SNPs under selection. The massive shift to lower values in the exome is due to the vast reduction in signatures of selection associated with habitat type now that we have no savannah-like population.

Null vs real data

## Null is chr21
## ce

Select candidates

Calculate FPR per coverage bin

We found that candidates were more likely to have lower coverages than expected from the background. We therefore decided to scalculate FPRs within candidate bins to account for this.

FPRs are estimated using a null distribution. SNPs in the null distribution are assigned empirical p values (BF rank/total number of SNPs). The null data is then combined with the exome data and ordered by BF. The FPR for each of the exome SNP is given as the smallest empirical p value of any null SNP with a lower or equal BF.

Example (winthin a single coverage bin):
Null BFs:   0   1   1   4   8
  Ranks:    5   4   4   2   1
  p:        1   0.8 0.8 0.4 0.2
  
Exome BFs:  0   2   3   9   10
  FDR:      1   0.8 0.8 0.2 0.2

We can see that in tied instances we use the lowest rank and that the minimum FPR possible is determined by the number of null SNPs.

This is done for each coverage bin.

## Null is chr21
##  [1] chr_pos         MRK             chr             pos            
##  [5] M_Beta          BF(dB)          M_Beta.r1       BF(dB).r1      
##  [9] M_Beta.r2       BF(dB).r2       M_Beta.median   BF(dB).median  
## [13] coverage        COVARIABLE_name
## <0 rows> (or 0-length row.names)
##  [1] chr_pos         MRK             chr             pos            
##  [5] M_Beta          BF(dB)          M_Beta.r1       BF(dB).r1      
##  [9] M_Beta.r2       BF(dB).r2       M_Beta.median   BF(dB).median  
## [13] coverage        COVARIABLE_name
## <0 rows> (or 0-length row.names)
## Coverage Bin Stats
## ce 
## Number of Candidates

## forest
## Coverage Bin Stats
## ce 
## Number of Candidates

## savannah
## Coverage Bin Stats
## ce 
## Number of Candidates

## Coverage Bin Stats
## ce 
## Number of Candidates

## forest
## Coverage Bin Stats
## ce 
## Number of Candidates

## savannah
## Coverage Bin Stats
## ce 
## Number of Candidates

Candidate volcano plots

There are a number of candidates with abs(betai) < 0.1 (> 0.1 is considered large effect) at less stringent thresholds in western and particularly in all samples.

Most candidates have abs(betai) > 0.1 for central-eastern, I expect this is because of the effect of Issa Valley.

Candidate Overlap

Distribution of candidates from full analysis in rm Issa Valley

Allele frequencies

Real env

Rank env

Here I plot the rank order of the environmental variables instead of the real values.

Ignore Issa Valley

Issa Valley is a forest tree % outlier, here I make a plot the results from the full analyis by just removed Issa Valley from the figure to see if we still see a correlation.